## Loading required modules
options(warn=-1)
library(stringr)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(reshape2)
library(ggplot2)
## Loading the survey data
# Read the raw survey responses. Every column is read as character, and the
# first data row carries the question text rather than a respondent's answers
# (visible in the str() output below).
df <- read.csv(file = 'kaggle_survey_2020_responses.csv')
dim(df)
## [1] 20037 355
str(df)
## 'data.frame': 20037 obs. of 355 variables:
## $ Time.from.Start.to.Finish..seconds.: chr "Duration (in seconds)" "1838" "289287" "860" ...
## $ Q1 : chr "What is your age (# years)?" "35-39" "30-34" "35-39" ...
## $ Q2 : chr "What is your gender? - Selected Choice" "Man" "Man" "Man" ...
## $ Q3 : chr "In which country do you currently reside?" "Colombia" "United States of America" "Argentina" ...
## $ Q4 : chr "What is the highest level of formal education that you have attained or plan to attain within the next 2 years?" "Doctoral degree" "Master’s degree" "Bachelor’s degree" ...
## $ Q5 : chr "Select the title most similar to your current role (or most recent title if retired): - Selected Choice" "Student" "Data Engineer" "Software Engineer" ...
## $ Q6 : chr "For how many years have you been writing code and/or programming?" "5-10 years" "5-10 years" "10-20 years" ...
## $ Q7_Part_1 : chr "What programming languages do you use on a regular basis? (Select all that apply) - Selected Choice - Python" "Python" "Python" "" ...
## $ Q7_Part_2 : chr "What programming languages do you use on a regular basis? (Select all that apply) - Selected Choice - R" "R" "R" "" ...
## $ Q7_Part_3 : chr "What programming languages do you use on a regular basis? (Select all that apply) - Selected Choice - SQL" "SQL" "SQL" "" ...
## $ Q7_Part_4 : chr "What programming languages do you use on a regular basis? (Select all that apply) - Selected Choice - C" "C" "" "" ...
## $ Q7_Part_5 : chr "What programming languages do you use on a regular basis? (Select all that apply) - Selected Choice - C++" "" "" "" ...
## $ Q7_Part_6 : chr "What programming languages do you use on a regular basis? (Select all that apply) - Selected Choice - Java" "" "" "Java" ...
## $ Q7_Part_7 : chr "What programming languages do you use on a regular basis? (Select all that apply) - Selected Choice - Javascript" "Javascript" "" "Javascript" ...
## $ Q7_Part_8 : chr "What programming languages do you use on a regular basis? (Select all that apply) - Selected Choice - Julia" "" "" "" ...
## $ Q7_Part_9 : chr "What programming languages do you use on a regular basis? (Select all that apply) - Selected Choice - Swift" "" "" "" ...
## $ Q7_Part_10 : chr "What programming languages do you use on a regular basis? (Select all that apply) - Selected Choice - Bash" "" "" "Bash" ...
## $ Q7_Part_11 : chr "What programming languages do you use on a regular basis? (Select all that apply) - Selected Choice - MATLAB" "MATLAB" "" "" ...
## $ Q7_Part_12 : chr "What programming languages do you use on a regular basis? (Select all that apply) - Selected Choice - None" "" "" "" ...
## $ Q7_OTHER : chr "What programming languages do you use on a regular basis? (Select all that apply) - Selected Choice - Other" "Other" "" "" ...
## $ Q8 : chr "What programming language would you recommend an aspiring data scientist to learn first? - Selected Choice" "Python" "Python" "R" ...
## $ Q9_Part_1 : chr "Which of the following integrated development environments (IDE's) do you use on a regular basis? (Select all "| __truncated__ "Jupyter (JupyterLab, Jupyter Notebooks, etc) " "" "" ...
## $ Q9_Part_2 : chr "Which of the following integrated development environments (IDE's) do you use on a regular basis? (Select all "| __truncated__ "" "" "" ...
## $ Q9_Part_3 : chr "Which of the following integrated development environments (IDE's) do you use on a regular basis? (Select all "| __truncated__ "" "Visual Studio" "" ...
## $ Q9_Part_4 : chr "Which of the following integrated development environments (IDE's) do you use on a regular basis? (Select all "| __truncated__ "Visual Studio Code (VSCode)" "" "Visual Studio Code (VSCode)" ...
## $ Q9_Part_5 : chr "Which of the following integrated development environments (IDE's) do you use on a regular basis? (Select all "| __truncated__ "" " PyCharm " "" ...
## $ Q9_Part_6 : chr "Which of the following integrated development environments (IDE's) do you use on a regular basis? (Select all "| __truncated__ " Spyder " "" "" ...
## $ Q9_Part_7 : chr "Which of the following integrated development environments (IDE's) do you use on a regular basis? (Select all "| __truncated__ "" "" " Notepad++ " ...
## $ Q9_Part_8 : chr "Which of the following integrated development environments (IDE's) do you use on a regular basis? (Select all "| __truncated__ "" " Sublime Text " " Sublime Text " ...
## $ Q9_Part_9 : chr "Which of the following integrated development environments (IDE's) do you use on a regular basis? (Select all "| __truncated__ "" "" " Vim / Emacs " ...
## $ Q9_Part_10 : chr "Which of the following integrated development environments (IDE's) do you use on a regular basis? (Select all "| __truncated__ "" "" "" ...
## $ Q9_Part_11 : chr "Which of the following integrated development environments (IDE's) do you use on a regular basis? (Select all "| __truncated__ "" "" "" ...
## $ Q9_OTHER : chr "Which of the following integrated development environments (IDE's) do you use on a regular basis? (Select all "| __truncated__ "" "" "" ...
## $ Q10_Part_1 : chr "Which of the following hosted notebook products do you use on a regular basis? (Select all that apply) - Selec"| __truncated__ " Kaggle Notebooks" "" "" ...
## $ Q10_Part_2 : chr "Which of the following hosted notebook products do you use on a regular basis? (Select all that apply) - Selec"| __truncated__ "Colab Notebooks" "Colab Notebooks" "" ...
## $ Q10_Part_3 : chr "Which of the following hosted notebook products do you use on a regular basis? (Select all that apply) - Selec"| __truncated__ "" "" "" ...
## $ Q10_Part_4 : chr "Which of the following hosted notebook products do you use on a regular basis? (Select all that apply) - Selec"| __truncated__ "" "" "" ...
## $ Q10_Part_5 : chr "Which of the following hosted notebook products do you use on a regular basis? (Select all that apply) - Selec"| __truncated__ "" "" "" ...
## $ Q10_Part_6 : chr "Which of the following hosted notebook products do you use on a regular basis? (Select all that apply) - Selec"| __truncated__ "" "" "" ...
## $ Q10_Part_7 : chr "Which of the following hosted notebook products do you use on a regular basis? (Select all that apply) - Selec"| __truncated__ "" "" "" ...
## $ Q10_Part_8 : chr "Which of the following hosted notebook products do you use on a regular basis? (Select all that apply) - Selec"| __truncated__ "" "" "" ...
## $ Q10_Part_9 : chr "Which of the following hosted notebook products do you use on a regular basis? (Select all that apply) - Selec"| __truncated__ "" "" "" ...
## $ Q10_Part_10 : chr "Which of the following hosted notebook products do you use on a regular basis? (Select all that apply) - Selec"| __truncated__ "" "" "" ...
## $ Q10_Part_11 : chr "Which of the following hosted notebook products do you use on a regular basis? (Select all that apply) - Selec"| __truncated__ "" "" "" ...
## $ Q10_Part_12 : chr "Which of the following hosted notebook products do you use on a regular basis? (Select all that apply) - Selec"| __truncated__ "" "" "" ...
## $ Q10_Part_13 : chr "Which of the following hosted notebook products do you use on a regular basis? (Select all that apply) - Selec"| __truncated__ "" "" "None" ...
## $ Q10_OTHER : chr "Which of the following hosted notebook products do you use on a regular basis? (Select all that apply) - Selec"| __truncated__ "" "" "" ...
## $ Q11 : chr "What type of computing platform do you use most often for your data science projects? - Selected Choice" "A cloud computing platform (AWS, Azure, GCP, hosted notebooks, etc)" "A personal computer or laptop" "A personal computer or laptop" ...
## $ Q12_Part_1 : chr "Which types of specialized hardware do you use on a regular basis? (Select all that apply) - Selected Choice - GPUs" "GPUs" "GPUs" "" ...
## $ Q12_Part_2 : chr "Which types of specialized hardware do you use on a regular basis? (Select all that apply) - Selected Choice - TPUs" "" "" "" ...
## $ Q12_Part_3 : chr "Which types of specialized hardware do you use on a regular basis? (Select all that apply) - Selected Choice - None" "" "" "None" ...
## $ Q12_OTHER : chr "Which types of specialized hardware do you use on a regular basis? (Select all that apply) - Selected Choice - Other" "" "" "" ...
## $ Q13 : chr "Approximately how many times have you used a TPU (tensor processing unit)?" "2-5 times" "2-5 times" "Never" ...
## $ Q14_Part_1 : chr "What data visualization libraries or tools do you use on a regular basis? (Select all that apply) - Selected C"| __truncated__ " Matplotlib " " Matplotlib " "" ...
## $ Q14_Part_2 : chr "What data visualization libraries or tools do you use on a regular basis? (Select all that apply) - Selected C"| __truncated__ "" " Seaborn " "" ...
## $ Q14_Part_3 : chr "What data visualization libraries or tools do you use on a regular basis? (Select all that apply) - Selected C"| __truncated__ "" "" "" ...
## $ Q14_Part_4 : chr "What data visualization libraries or tools do you use on a regular basis? (Select all that apply) - Selected C"| __truncated__ "" " Ggplot / ggplot2 " "" ...
## $ Q14_Part_5 : chr "What data visualization libraries or tools do you use on a regular basis? (Select all that apply) - Selected Choice - Shiny " "" " Shiny " "" ...
## $ Q14_Part_6 : chr "What data visualization libraries or tools do you use on a regular basis? (Select all that apply) - Selected Choice - D3 js " "" "" " D3 js " ...
## $ Q14_Part_7 : chr "What data visualization libraries or tools do you use on a regular basis? (Select all that apply) - Selected C"| __truncated__ "" "" "" ...
## $ Q14_Part_8 : chr "What data visualization libraries or tools do you use on a regular basis? (Select all that apply) - Selected Choice - Bokeh " "" "" "" ...
## $ Q14_Part_9 : chr "What data visualization libraries or tools do you use on a regular basis? (Select all that apply) - Selected C"| __truncated__ " Geoplotlib " "" "" ...
## $ Q14_Part_10 : chr "What data visualization libraries or tools do you use on a regular basis? (Select all that apply) - Selected C"| __truncated__ "" "" "" ...
## $ Q14_Part_11 : chr "What data visualization libraries or tools do you use on a regular basis? (Select all that apply) - Selected Choice - None" "" "" "" ...
## $ Q14_OTHER : chr "What data visualization libraries or tools do you use on a regular basis? (Select all that apply) - Selected Choice - Other" "" "" "" ...
## $ Q15 : chr "For how many years have you used machine learning methods?" "1-2 years" "1-2 years" "I do not use machine learning methods" ...
## $ Q16_Part_1 : chr "Which of the following machine learning frameworks do you use on a regular basis? (Select all that apply) - Sel"| __truncated__ "" " Scikit-learn " "" ...
## $ Q16_Part_2 : chr "Which of the following machine learning frameworks do you use on a regular basis? (Select all that apply) - Sel"| __truncated__ " TensorFlow " " TensorFlow " "" ...
## $ Q16_Part_3 : chr "Which of the following machine learning frameworks do you use on a regular basis? (Select all that apply) - Sel"| __truncated__ " Keras " " Keras " "" ...
## $ Q16_Part_4 : chr "Which of the following machine learning frameworks do you use on a regular basis? (Select all that apply) - Sel"| __truncated__ "" " PyTorch " "" ...
## $ Q16_Part_5 : chr "Which of the following machine learning frameworks do you use on a regular basis? (Select all that apply) - Sel"| __truncated__ "" "" "" ...
## $ Q16_Part_6 : chr "Which of the following machine learning frameworks do you use on a regular basis? (Select all that apply) - Sel"| __truncated__ "" "" "" ...
## $ Q16_Part_7 : chr "Which of the following machine learning frameworks do you use on a regular basis? (Select all that apply) - Sel"| __truncated__ " Xgboost " "" "" ...
## $ Q16_Part_8 : chr "Which of the following machine learning frameworks do you use on a regular basis? (Select all that apply) - Sel"| __truncated__ "" "" "" ...
## $ Q16_Part_9 : chr "Which of the following machine learning frameworks do you use on a regular basis? (Select all that apply) - Sel"| __truncated__ "" "" "" ...
## $ Q16_Part_10 : chr "Which of the following machine learning frameworks do you use on a regular basis? (Select all that apply) - Sel"| __truncated__ "" "" "" ...
## $ Q16_Part_11 : chr "Which of the following machine learning frameworks do you use on a regular basis? (Select all that apply) - Sel"| __truncated__ "" "" "" ...
## $ Q16_Part_12 : chr "Which of the following machine learning frameworks do you use on a regular basis? (Select all that apply) - Sel"| __truncated__ "" "" "" ...
## $ Q16_Part_13 : chr "Which of the following machine learning frameworks do you use on a regular basis? (Select all that apply) - Sel"| __truncated__ "" "" "" ...
## $ Q16_Part_14 : chr "Which of the following machine learning frameworks do you use on a regular basis? (Select all that apply) - Sel"| __truncated__ "" "" "" ...
## $ Q16_Part_15 : chr "Which of the following machine learning frameworks do you use on a regular basis? (Select all that apply) - Sel"| __truncated__ "" "" "" ...
## $ Q16_OTHER : chr "Which of the following machine learning frameworks do you use on a regular basis? (Select all that apply) - Sel"| __truncated__ "" "" "" ...
## $ Q17_Part_1 : chr "Which of the following ML algorithms do you use on a regular basis? (Select all that apply): - Selected Choice "| __truncated__ "" "Linear or Logistic Regression" "" ...
## $ Q17_Part_2 : chr "Which of the following ML algorithms do you use on a regular basis? (Select all that apply): - Selected Choice "| __truncated__ "Decision Trees or Random Forests" "" "" ...
## $ Q17_Part_3 : chr "Which of the following ML algorithms do you use on a regular basis? (Select all that apply): - Selected Choice "| __truncated__ "Gradient Boosting Machines (xgboost, lightgbm, etc)" "" "" ...
## $ Q17_Part_4 : chr "Which of the following ML algorithms do you use on a regular basis? (Select all that apply): - Selected Choice "| __truncated__ "Bayesian Approaches" "" "" ...
## $ Q17_Part_5 : chr "Which of the following ML algorithms do you use on a regular basis? (Select all that apply): - Selected Choice "| __truncated__ "" "" "" ...
## $ Q17_Part_6 : chr "Which of the following ML algorithms do you use on a regular basis? (Select all that apply): - Selected Choice "| __truncated__ "Dense Neural Networks (MLPs, etc)" "" "" ...
## $ Q17_Part_7 : chr "Which of the following ML algorithms do you use on a regular basis? (Select all that apply): - Selected Choice "| __truncated__ "Convolutional Neural Networks" "Convolutional Neural Networks" "" ...
## $ Q17_Part_8 : chr "Which of the following ML algorithms do you use on a regular basis? (Select all that apply): - Selected Choice "| __truncated__ "" "" "" ...
## $ Q17_Part_9 : chr "Which of the following ML algorithms do you use on a regular basis? (Select all that apply): - Selected Choice "| __truncated__ "Recurrent Neural Networks" "" "" ...
## $ Q17_Part_10 : chr "Which of the following ML algorithms do you use on a regular basis? (Select all that apply): - Selected Choice "| __truncated__ "" "Transformer Networks (BERT, gpt-3, etc)" "" ...
## $ Q17_Part_11 : chr "Which of the following ML algorithms do you use on a regular basis? (Select all that apply): - Selected Choice - None" "" "" "" ...
## $ Q17_OTHER : chr "Which of the following ML algorithms do you use on a regular basis? (Select all that apply): - Selected Choice - Other" "" "" "" ...
## $ Q18_Part_1 : chr "Which categories of computer vision methods do you use on a regular basis? (Select all that apply) - Selected "| __truncated__ "" "" "" ...
## $ Q18_Part_2 : chr "Which categories of computer vision methods do you use on a regular basis? (Select all that apply) - Selected "| __truncated__ "" "Image segmentation methods (U-Net, Mask R-CNN, etc)" "" ...
## $ Q18_Part_3 : chr "Which categories of computer vision methods do you use on a regular basis? (Select all that apply) - Selected "| __truncated__ "" "" "" ...
## $ Q18_Part_4 : chr "Which categories of computer vision methods do you use on a regular basis? (Select all that apply) - Selected "| __truncated__ "Image classification and other general purpose networks (VGG, Inception, ResNet, ResNeXt, NASNet, EfficientNet, etc)" "Image classification and other general purpose networks (VGG, Inception, ResNet, ResNeXt, NASNet, EfficientNet, etc)" "" ...
## $ Q18_Part_5 : chr "Which categories of computer vision methods do you use on a regular basis? (Select all that apply) - Selected "| __truncated__ "" "" "" ...
## [list output truncated]
## Loading questions dataframe
# Read the question metadata: one row per survey column header (q_header),
# with its question key, question type, question text, missing percentage
# and a short tag.
q_df <- read.csv(file = 'questions_dataframe.csv')
dim(q_df)
## [1] 354 7
str(q_df)
## 'data.frame': 354 obs. of 7 variables:
## $ ques_num : chr "Q1" "Q10" "Q10" "Q10" ...
## $ q_header : chr "Q1" "Q10_Part_1" "Q10_Part_2" "Q10_Part_3" ...
## $ ques_keys : chr "Q1" "Q10" "Q10" "Q10" ...
## $ ques_type : chr "single_answer" "multiple_answer" "multiple_answer" "multiple_answer" ...
## $ question : chr "What is your age (# years)?" "Which of the following hosted notebook products do you use on a regular basis? (Select all that apply) - Selec"| __truncated__ "Which of the following hosted notebook products do you use on a regular basis? (Select all that apply) - Selec"| __truncated__ "Which of the following hosted notebook products do you use on a regular basis? (Select all that apply) - Selec"| __truncated__ ...
## $ missing_perc: num 0 14.2 14.2 14.2 14.2 ...
## $ tag : chr "age" "Notebook Products" "Notebook Products" "Notebook Products" ...
## Creating Salary Buckets
# Treat empty strings as missing answers across the whole raw table
df[df == ""] <- NA
# The first row of the export holds the question text, not a response (see the
# str() output of df), so drop it explicitly before filtering. Negative
# indexing stays correct however few rows remain, unlike `2:nrow(dat)`.
dat <- df[-1, ]
# Keep only respondents who answered the salary question (Q24)
dat <- dat[!is.na(dat$Q24), ]
dat$Time.from.Start.to.Finish..seconds. <- NULL
dim(dat)
## [1] 10729 354
# Raw Q24 salary ranges grouped into five coarse brackets
v_low <- c('$0-999', '1,000-1,999', '2,000-2,999', '3,000-3,999', "4,000-4,999", '5,000-7,499',
           '7,500-9,999', '10,000-14,999', '15,000-19,999')
low <- c('20,000-24,999', '25,000-29,999', '30,000-39,999', '40,000-49,999')
medium <- c('50,000-59,999', '60,000-69,999', '70,000-79,999', '80,000-89,999', '90,000-99,999')
high <- c('100,000-124,999', '125,000-149,999')
v_high <- c('150,000-199,999', '200,000-249,999', '250,000-299,999', '300,000-500,000', '> $500,000')
# Replace each raw range with its bracket label. Any value not listed in
# v_low/low/medium/high falls through to 'very high' (v_high is not consulted).
dat$Q24 <- ifelse(dat$Q24 %in% v_low, 'very low',
           ifelse(dat$Q24 %in% low, 'low',
           ifelse(dat$Q24 %in% medium, 'medium',
           ifelse(dat$Q24 %in% high, 'high', 'very high'))))
# Respondent count per bracket, largest first
dat %>% count(dat[,'Q24'], sort = TRUE)
## dat[, "Q24"] n
## 1 very low 5555
## 2 medium 1865
## 3 low 1806
## 4 high 888
## 5 very high 615
# List every distinct (question key, tag) pair from the question metadata
q_df[, c('ques_keys', 'tag')] %>% unique()
## ques_keys tag
## 1 Q1 age
## 2 Q10 Notebook Products
## 16 Q11 Computing Platform
## 17 Q12 Specialized Hardware
## 21 Q13 TPU Usage
## 22 Q14 Visualisation Tools
## 34 Q15 Machine Learning Experience
## 35 Q16 ML Framework
## 51 Q17 ML Algorithms
## 63 Q18 Computer Vision Methods
## 70 Q19 NLP Methods
## 76 Q2 gender
## 77 Q20 Company Size
## 78 Q21 Data Science Team Size
## 79 Q22 ML in production
## 80 Q23 Work Responsibilites
## 88 Q24 Salary
## 89 Q25 ML & Cloud Spend
## 90 Q26_A Cloud Computing Platforms
## 102 Q26_B Cloud Computing Platforms
## 114 Q27_A Cloud Computing Products
## 126 Q27_B Cloud Computing Products
## 138 Q28_A Machine Learning Products
## 149 Q28_B Machine Learning Products
## 160 Q29_A Big Data Products
## 178 Q29_B Big Data Products
## 196 Q3 country of residence
## 197 Q30 Big Data Products Most Often
## 198 Q31_A BI Tools
## 213 Q31_B BI Tools
## 228 Q32 BI Tools Most Often
## 229 Q33_A Auto ML Areas
## 237 Q33_B Auto ML Areas
## 245 Q34_A Auto ML Tools
## 257 Q34_B Auto ML Tools
## 269 Q35_A Experiments Tracking
## 280 Q35_B Experiments Tracking
## 291 Q36 Analysis Sharing Platforms
## 301 Q37 DS Course Platforms
## 313 Q38 Data Analysis Tools
## 314 Q39 Data Science Media Sources
## 326 Q4 education
## 327 Q5 current role
## 328 Q6 coding experience
## 329 Q7 programming languages
## 342 Q8 programming language recommendation
## 343 Q9 IDE
## Helper Functions
# Express each element of a numeric vector as a percentage of the vector's
# total, rounded to two decimal places.
#   x: numeric vector of counts.
# Returns a numeric vector the same length as x.
perc_share <- function(x) {
  total <- sum(x)
  round((x / total) * 100, digits = 2)
}
# Compute a contingency table for two categorical variables and convert the
# counts to percentage values.
#
# Reads the globals `dat` (survey responses) and `q_df` (question metadata).
#   ques_num_1: question key as it appears in q_df$ques_keys (e.g. 'Q4').
#               For a 'single_answer' question the column of that name in
#               `dat` is tabulated; otherwise every column listed under that
#               key in q_df$q_header is tabulated and the tables are stacked.
#   ques_num_2: column of `dat` to tabulate against (default 'Q24'). The
#               result is restricted to the five salary-bracket labels, so
#               this column must contain them.
# Returns a numeric matrix with one row per answer option and the columns
# 'very low', 'low', 'medium', 'high', 'very high' (in that order); each
# column has been passed through perc_share().
# Stops with an error when ques_num_1 is not present in q_df$ques_keys.
contingency_table_pct <- function(ques_num_1, ques_num_2 = 'Q24'){
  bracket_order <- c('very low', 'low', 'medium', 'high', 'very high')

  key_rows <- which(q_df$ques_keys == ques_num_1)
  if (length(key_rows) == 0) {
    stop("Unknown question key: ", ques_num_1, call. = FALSE)
  }

  if (q_df$ques_type[key_rows[1]] == 'single_answer') {
    cross_tab <- table(dat[, ques_num_1], dat[, ques_num_2])
  } else {
    list_of_columns <- q_df$q_header[key_rows]
    # One table per answer-option column, stacked in a single rbind() call.
    # This also works when the question has just one column, where indexing
    # with 2:length(list_of_columns) would run backwards.
    per_column <- lapply(list_of_columns, function(col) {
      table(dat[, col], dat[, ques_num_2])
    })
    cross_tab <- do.call(rbind, per_column)
  }

  # drop = FALSE keeps a matrix even when only one answer option is present
  cross_tab <- cross_tab[, bracket_order, drop = FALSE]
  cross_tab_pct <- apply(cross_tab, 2, perc_share)
  # apply() returns a plain vector when each column yields a single value
  # (one-row table); restore the matrix shape and its dimnames.
  if (is.null(dim(cross_tab_pct))) {
    cross_tab_pct <- matrix(cross_tab_pct,
                            nrow = nrow(cross_tab),
                            ncol = ncol(cross_tab),
                            dimnames = dimnames(cross_tab))
  }
  cross_tab_pct
}
# Side-by-side (dodged) bar plot of a percentage matrix.
#   x: matrix-like object; its row names become the fill legend entries and
#      its column names the groups along the x axis.
#   title, xlabel, ylabel: plot title and axis labels.
# Returns the ggplot object.
ss_barplot <- function(x, title, xlabel, ylabel){
  # check.names = FALSE keeps column names such as 'very low' verbatim;
  # the default would rewrite them to syntactic names ('very.low').
  a <- data.frame(x, check.names = FALSE)
  a$index <- row.names(a)
  # melt() reports which column it uses as the id variable (here `index`)
  a <- melt(a)
  p <- ggplot(a, aes(x = variable, y = value, fill = index)) +
    geom_bar(stat = 'identity', position = 'dodge')
  p + ggtitle(title) + labs(y = ylabel, x = xlabel)
}
# Multiple line plot of a percentage matrix: one line per row of x.
#   x: matrix-like object; its row names become the line groups/colours and
#      its column names the positions along the x axis.
#   title, xlabel, ylabel: plot title and axis labels.
# Returns the ggplot object.
line_charts <- function(x, title, xlabel, ylabel){
  # check.names = FALSE keeps column names such as 'very low' verbatim;
  # the default would rewrite them to syntactic names ('very.low').
  a <- data.frame(x, check.names = FALSE)
  a$index <- row.names(a)
  # melt() reports which column it uses as the id variable (here `index`)
  a <- melt(a)
  p <- ggplot(a, aes(x = variable, y = value, group = index, color = index)) +
    geom_line()
  p + ggtitle(title) + labs(y = ylabel, x = xlabel)
}
## Salary vs formal education (Q4)
# Formal education (Q4) against salary bracket; two long answer labels are
# shortened so the legend stays readable.
cross_tab_pct <- contingency_table_pct('Q4', 'Q24')
row.names(cross_tab_pct)[row.names(cross_tab_pct) == 'Some college/university study without earning a bachelor’s degree'] <- 'College experience'
row.names(cross_tab_pct)[row.names(cross_tab_pct) == 'No formal education past high school'] <- 'High School'
title <- 'Education Vs Salary'
xlabel <- 'salary_bracket'
ylabel <- 'perc_share'
ss_barplot(x = cross_tab_pct, title = title, xlabel = xlabel, ylabel = ylabel)
## Using index as id variables
line_charts(x = cross_tab_pct, title = title, xlabel = xlabel, ylabel = ylabel)
## Using index as id variables
While a Master’s degree is common across all income brackets, the share of respondents with only a bachelor’s degree declines from the low to the high income groups. The share of doctorates rises steadily from the low to the high income groups.
## Salary vs DS Course Platform (Q37)
# Data science course platforms (Q37) against salary bracket; two long answer
# labels are shortened so the legend stays readable.
cross_tab_pct <- contingency_table_pct('Q37', 'Q24')
row.names(cross_tab_pct)[row.names(cross_tab_pct) == 'Cloud-certification programs (direct from AWS, Azure, GCP, or similar)'] <- 'Cloud-certification programs'
row.names(cross_tab_pct)[row.names(cross_tab_pct) == 'University Courses (resulting in a university degree)'] <- 'University Courses'
title <- 'DS Course Platform Vs Salary'
xlabel <- 'salary_bracket'
ylabel <- 'perc_share'
ss_barplot(x = cross_tab_pct, title = title, xlabel = xlabel, ylabel = ylabel)
## Using index as id variables
line_charts(x = cross_tab_pct, title = title, xlabel = xlabel, ylabel = ylabel)
## Using index as id variables
Popular course platforms (Coursera, edX, Fast.ai) that provide quality content in complex areas like deep learning see a steady rise from the low to the high income groups, while platforms with starter courses, such as Kaggle Learn and Udemy, are more popular among the low income groups.
## Salary vs Analysis Sharing Platforms (Q36)
# Analysis sharing platforms (Q36) against salary bracket
cross_tab_pct <- contingency_table_pct('Q36', 'Q24')
title <- 'Analysis Sharing Platforms Vs Salary'
xlabel <- 'salary_bracket'
ylabel <- 'perc_share'
ss_barplot(x = cross_tab_pct, title = title, xlabel = xlabel, ylabel = ylabel)
## Using index as id variables
line_charts(x = cross_tab_pct, title = title, xlabel = xlabel, ylabel = ylabel)
## Using index as id variables
GitHub remains the most popular choice for sharing analysis across salary brackets. However, as income increases, activity on public platforms (Kaggle, Colab, etc.) goes down.
## Salary vs DS Media Sources (Q39)
# Data science media sources (Q39) against salary bracket
cross_tab_pct <- contingency_table_pct('Q39', 'Q24')
title <- 'DS Media Sources Vs Salary'
xlabel <- 'salary_bracket'
ylabel <- 'perc_share'
ss_barplot(x = cross_tab_pct, title = title, xlabel = xlabel, ylabel = ylabel)
## Using index as id variables
line_charts(x = cross_tab_pct, title = title, xlabel = xlabel, ylabel = ylabel)
## Using index as id variables
The high income bracket relies on data science experts to keep up with industry trends. They do this by reading journal publications, following people on Twitter, and signing up for email newsletters, podcasts and blogs. They avoid community platforms like YouTube, Slack communities or course forums.
## Salary vs Country of Residence (Q3)
# Country of residence (Q3) against salary bracket for a hand-picked set of
# countries.
# 'Japan' was previously written with a leading space (' Japan'), which does
# not match the unpadded Q3 values seen in the str() output of df, so Japan
# was silently excluded. NOTE(review): confirm the exact spelling in the data.
countries <- c('India', 'United Kingdom of Great Britain and Northern Ireland',
               'United States of America', 'Brazil', 'Japan', 'Russia', 'Other',
               'Nigeria', 'China', 'Germany')
# trimws() guards the match against stray padding in the raw answers
country_data <- dat[trimws(dat$Q3) %in% countries, ]
cross_tab <- table(country_data[, 'Q3'], country_data[, 'Q24'])
# table() orders the brackets alphabetically; put them in ascending salary
# order, the same order contingency_table_pct() uses for the other plots.
cross_tab <- cross_tab[, c('very low', 'low', 'medium', 'high', 'very high')]
cross_tab_pct <- apply(cross_tab, 2, perc_share)
row.names(cross_tab_pct)[row.names(cross_tab_pct) == 'United Kingdom of Great Britain and Northern Ireland'] <- 'UK'
row.names(cross_tab_pct)[row.names(cross_tab_pct) == 'United States of America'] <- 'USA'
title <- 'Country of Residence Vs Salary'
xlabel <- 'salary_bracket'
ylabel <- 'perc_share'
ss_barplot(cross_tab_pct, title, xlabel, ylabel)
## Using index as id variables
line_charts(cross_tab_pct, title, xlabel, ylabel)
## Using index as id variables
The USA is the place to be for Data Science professionals, followed by developed countries like the UK and Germany. Developing countries like India, China etc. have fewer high-paying roles.
## Salary vs Company Size (Q20)
# Company size (Q20) against salary bracket
cross_tab_pct <- contingency_table_pct('Q20', 'Q24')
title <- 'Company Size Vs Salary'
xlabel <- 'salary_bracket'
ylabel <- 'perc_share'
ss_barplot(x = cross_tab_pct, title = title, xlabel = xlabel, ylabel = ylabel)
## Using index as id variables
line_charts(x = cross_tab_pct, title = title, xlabel = xlabel, ylabel = ylabel)
## Using index as id variables
Large companies with 1000 or more personnel are the best places to work for high income.
## Salary vs DS Team Size (Q21)
# Data science team size (Q21) against salary bracket
cross_tab_pct <- contingency_table_pct('Q21', 'Q24')
title <- 'DS Team Size Vs Salary'
xlabel <- 'salary_bracket'
ylabel <- 'perc_share'
ss_barplot(x = cross_tab_pct, title = title, xlabel = xlabel, ylabel = ylabel)
## Using index as id variables
line_charts(x = cross_tab_pct, title = title, xlabel = xlabel, ylabel = ylabel)
## Using index as id variables
Large Data Science teams (>15) are the go-to places for high incomes.
## Salary vs ML in Production (Q22)
# ML in production (Q22) against salary bracket
cross_tab_pct <- contingency_table_pct('Q22', 'Q24')
title <- 'ML in Production Vs Salary'
xlabel <- 'salary_bracket'
ylabel <- 'perc_share'
ss_barplot(x = cross_tab_pct, title = title, xlabel = xlabel, ylabel = ylabel)
## Using index as id variables
line_charts(x = cross_tab_pct, title = title, xlabel = xlabel, ylabel = ylabel)
## Using index as id variables
Places using ML in production pay more than the ones not using it.
## Salary vs current role (Q5)
# Current role (Q5) against salary bracket
cross_tab_pct <- contingency_table_pct('Q5', 'Q24')
title <- 'current role Vs Salary'
xlabel <- 'salary_bracket'
ylabel <- 'perc_share'
ss_barplot(x = cross_tab_pct, title = title, xlabel = xlabel, ylabel = ylabel)
## Using index as id variables
line_charts(x = cross_tab_pct, title = title, xlabel = xlabel, ylabel = ylabel)
## Using index as id variables
Data Scientist and Product Manager are the highest paying roles.
## Salary vs coding experience (Q6)
# Coding experience (Q6) against salary bracket
cross_tab_pct <- contingency_table_pct('Q6', 'Q24')
title <- 'coding experience Vs Salary'
xlabel <- 'salary_bracket'
ylabel <- 'perc_share'
ss_barplot(x = cross_tab_pct, title = title, xlabel = xlabel, ylabel = ylabel)
## Using index as id variables
line_charts(x = cross_tab_pct, title = title, xlabel = xlabel, ylabel = ylabel)
## Using index as id variables
Coding experience pays. More experience translates to more pay.
## Salary vs Machine Learning Experience (Q15)
# Machine learning experience (Q15) against salary bracket
cross_tab_pct <- contingency_table_pct('Q15', 'Q24')
title <- 'Machine Learning Experience Vs Salary'
xlabel <- 'salary_bracket'
ylabel <- 'perc_share'
ss_barplot(x = cross_tab_pct, title = title, xlabel = xlabel, ylabel = ylabel)
## Using index as id variables
line_charts(x = cross_tab_pct, title = title, xlabel = xlabel, ylabel = ylabel)
## Using index as id variables
ML Experience pays. More experience translates to more money.
## Salary vs Work Responsibilities (Q23)
# Work responsibilities (Q23) against salary bracket
cross_tab_pct <- contingency_table_pct('Q23', 'Q24')
title <- 'Work Responsibilities Vs Salary'
xlabel <- 'salary_bracket'
ylabel <- 'perc_share'
ss_barplot(x = cross_tab_pct, title = title, xlabel = xlabel, ylabel = ylabel)
## Using index as id variables
line_charts(x = cross_tab_pct, title = title, xlabel = xlabel, ylabel = ylabel)
## Using index as id variables
Using machine learning to generate value for companies by applying it in novel areas or improving existing systems pays the most.